In [3]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [2]:
#upload all files from the lib folder here
from google.colab import files
def getLocalFiles():
    _files = files.upload()
    if len(_files) >0:
       for k,v in _files.items():
         open(k,'wb').write(v)
getLocalFiles()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving app.py to app.py
Saving automation.py to automation.py
Saving data_augmentation.py to data_augmentation.py
Saving desktop.ini to desktop.ini
Saving encode_data.py to encode_data.py
Saving feature_extraction.py to feature_extraction.py
Saving preprocess_data.py to preprocess_data.py
Saving train_DL_model.py to train_DL_model.py
Saving train_ml_model.py to train_ml_model.py
Saving vectorizer.py to vectorizer.py
In [ ]:
!pip install nltk
import nltk
nltk.download('all')
!pip install catboost
!pip install xgboost
!pip install jsonify
!pip install requests
!pip install flask-ngrok
!pip install werkzeug
!pip install simplejson
!pip install Flask-Session
!!pip install ChatterBot
Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (3.2.5)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk) (1.15.0)
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package city_database to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/city_database.zip.
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package comparative_sentences to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/comparative_sentences.zip.
[nltk_data]    | Downloading package comtrans to /root/nltk_data...
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package conll2007 to /root/nltk_data...
[nltk_data]    | Downloading package crubadan to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/crubadan.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/dependency_treebank.zip.
[nltk_data]    | Downloading package dolch to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/dolch.zip.
[nltk_data]    | Downloading package europarl_raw to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/europarl_raw.zip.
[nltk_data]    | Downloading package floresta to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/floresta.zip.
[nltk_data]    | Downloading package framenet_v15 to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/framenet_v15.zip.
[nltk_data]    | Downloading package framenet_v17 to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/framenet_v17.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package ieer to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/ieer.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package indian to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/indian.zip.
[nltk_data]    | Downloading package jeita to /root/nltk_data...
[nltk_data]    | Downloading package kimmo to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/kimmo.zip.
[nltk_data]    | Downloading package knbc to /root/nltk_data...
[nltk_data]    | Downloading package lin_thesaurus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/lin_thesaurus.zip.
[nltk_data]    | Downloading package mac_morpho to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/mac_morpho.zip.
[nltk_data]    | Downloading package machado to /root/nltk_data...
[nltk_data]    | Downloading package masc_tagged to /root/nltk_data...
[nltk_data]    | Downloading package moses_sample to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping models/moses_sample.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package nombank.1.0 to /root/nltk_data...
[nltk_data]    | Downloading package nps_chat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/nps_chat.zip.
[nltk_data]    | Downloading package omw to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/omw.zip.
[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/opinion_lexicon.zip.
[nltk_data]    | Downloading package paradigms to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/paradigms.zip.
[nltk_data]    | Downloading package pil to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/pil.zip.
[nltk_data]    | Downloading package pl196x to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/pl196x.zip.
[nltk_data]    | Downloading package ppattach to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/ppattach.zip.
[nltk_data]    | Downloading package problem_reports to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/problem_reports.zip.
[nltk_data]    | Downloading package propbank to /root/nltk_data...
[nltk_data]    | Downloading package ptb to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/ptb.zip.
[nltk_data]    | Downloading package product_reviews_1 to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/product_reviews_1.zip.
[nltk_data]    | Downloading package product_reviews_2 to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/product_reviews_2.zip.
[nltk_data]    | Downloading package pros_cons to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/pros_cons.zip.
[nltk_data]    | Downloading package qc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/qc.zip.
[nltk_data]    | Downloading package reuters to /root/nltk_data...
[nltk_data]    | Downloading package rte to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/rte.zip.
[nltk_data]    | Downloading package semcor to /root/nltk_data...
[nltk_data]    | Downloading package senseval to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/senseval.zip.
[nltk_data]    | Downloading package sentiwordnet to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/sentiwordnet.zip.
[nltk_data]    | Downloading package sentence_polarity to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/sentence_polarity.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/shakespeare.zip.
[nltk_data]    | Downloading package sinica_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/sinica_treebank.zip.
[nltk_data]    | Downloading package smultron to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/smultron.zip.
[nltk_data]    | Downloading package state_union to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/state_union.zip.
[nltk_data]    | Downloading package stopwords to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/stopwords.zip.
[nltk_data]    | Downloading package subjectivity to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/subjectivity.zip.
[nltk_data]    | Downloading package swadesh to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/swadesh.zip.
[nltk_data]    | Downloading package switchboard to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/switchboard.zip.
[nltk_data]    | Downloading package timit to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/timit.zip.
[nltk_data]    | Downloading package toolbox to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/toolbox.zip.
[nltk_data]    | Downloading package treebank to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/treebank.zip.
[nltk_data]    | Downloading package twitter_samples to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/twitter_samples.zip.
[nltk_data]    | Downloading package udhr to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/udhr.zip.
[nltk_data]    | Downloading package udhr2 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/udhr2.zip.
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/unicode_samples.zip.
[nltk_data]    | Downloading package universal_treebanks_v20 to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    | Downloading package verbnet to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/verbnet.zip.
[nltk_data]    | Downloading package verbnet3 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/verbnet3.zip.
[nltk_data]    | Downloading package webtext to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/webtext.zip.
[nltk_data]    | Downloading package wordnet to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/wordnet.zip.
[nltk_data]    | Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/wordnet_ic.zip.
[nltk_data]    | Downloading package words to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/words.zip.
[nltk_data]    | Downloading package ycoe to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/ycoe.zip.
[nltk_data]    | Downloading package rslp to /root/nltk_data...
[nltk_data]    |   Unzipping stemmers/rslp.zip.
[nltk_data]    | Downloading package maxent_treebank_pos_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/maxent_treebank_pos_tagger.zip.
[nltk_data]    | Downloading package universal_tagset to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/universal_tagset.zip.
[nltk_data]    | Downloading package maxent_ne_chunker to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data]    | Downloading package punkt to /root/nltk_data...
[nltk_data]    |   Unzipping tokenizers/punkt.zip.
[nltk_data]    | Downloading package book_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/book_grammars.zip.
[nltk_data]    | Downloading package sample_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/sample_grammars.zip.
[nltk_data]    | Downloading package spanish_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/spanish_grammars.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package large_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/large_grammars.zip.
[nltk_data]    | Downloading package tagsets to /root/nltk_data...
[nltk_data]    |   Unzipping help/tagsets.zip.
[nltk_data]    | Downloading package snowball_data to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    | Downloading package bllip_wsj_no_aux to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping models/bllip_wsj_no_aux.zip.
[nltk_data]    | Downloading package word2vec_sample to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping models/word2vec_sample.zip.
[nltk_data]    | Downloading package panlex_swadesh to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    | Downloading package mte_teip5 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/mte_teip5.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping misc/perluniprops.zip.
[nltk_data]    | Downloading package nonbreaking_prefixes to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/nonbreaking_prefixes.zip.
[nltk_data]    | Downloading package vader_lexicon to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    | Downloading package porter_test to /root/nltk_data...
[nltk_data]    |   Unzipping stemmers/porter_test.zip.
[nltk_data]    | Downloading package wmt15_eval to /root/nltk_data...
[nltk_data]    |   Unzipping models/wmt15_eval.zip.
[nltk_data]    | Downloading package mwa_ppdb to /root/nltk_data...
[nltk_data]    |   Unzipping misc/mwa_ppdb.zip.
[nltk_data]    | 
[nltk_data]  Done downloading collection all
Collecting catboost
  Downloading catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2 MB)
     |████████████████████████████████| 69.2 MB 5.2 kB/s 
Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from catboost) (1.4.1)
Requirement already satisfied: graphviz in /usr/local/lib/python3.7/dist-packages (from catboost) (0.10.1)
Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.7/dist-packages (from catboost) (1.1.5)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from catboost) (3.2.2)
Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.7/dist-packages (from catboost) (1.19.5)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from catboost) (1.15.0)
Requirement already satisfied: plotly in /usr/local/lib/python3.7/dist-packages (from catboost) (4.4.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.24.0->catboost) (2018.9)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.24.0->catboost) (2.8.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->catboost) (1.3.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->catboost) (2.4.7)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->catboost) (0.10.0)
Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.7/dist-packages (from plotly->catboost) (1.3.3)
Installing collected packages: catboost
Successfully installed catboost-0.26
Requirement already satisfied: xgboost in /usr/local/lib/python3.7/dist-packages (0.90)
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from xgboost) (1.19.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from xgboost) (1.4.1)
Collecting jsonify
  Downloading jsonify-0.5.tar.gz (1.0 kB)
Building wheels for collected packages: jsonify
  Building wheel for jsonify (setup.py) ... done
  Created wheel for jsonify: filename=jsonify-0.5-py3-none-any.whl size=1562 sha256=8728f1ad41f2c9d92003f6f96e9850bef30b5dd9fb46981c9c34fb64d72a36b3
  Stored in directory: /root/.cache/pip/wheels/a7/15/e0/a5eb19cf0496a9ea2bddb0c9f1dc324559385806b75400988b
Successfully built jsonify
Installing collected packages: jsonify
Successfully installed jsonify-0.5
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (2.23.0)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests) (2021.5.30)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests) (1.24.3)
Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Requirement already satisfied: Flask>=0.8 in /usr/local/lib/python3.7/dist-packages (from flask-ngrok) (1.1.4)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from flask-ngrok) (2.23.0)
Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->flask-ngrok) (2.11.3)
Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->flask-ngrok) (1.0.1)
Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->flask-ngrok) (1.1.0)
Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->flask-ngrok) (7.1.2)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=0.8->flask-ngrok) (2.0.1)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->flask-ngrok) (2021.5.30)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->flask-ngrok) (1.24.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->flask-ngrok) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->flask-ngrok) (2.10)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Requirement already satisfied: werkzeug in /usr/local/lib/python3.7/dist-packages (1.0.1)
Collecting simplejson
  Downloading simplejson-3.17.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (129 kB)
     |████████████████████████████████| 129 kB 4.3 MB/s 
Installing collected packages: simplejson
Successfully installed simplejson-3.17.3
Collecting Flask-Session
  Downloading Flask_Session-0.4.0-py2.py3-none-any.whl (7.5 kB)
Collecting cachelib
  Downloading cachelib-0.2.0-py3-none-any.whl (12 kB)
Requirement already satisfied: Flask>=0.8 in /usr/local/lib/python3.7/dist-packages (from Flask-Session) (1.1.4)
Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->Flask-Session) (2.11.3)
Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->Flask-Session) (7.1.2)
Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->Flask-Session) (1.1.0)
Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.8->Flask-Session) (1.0.1)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=0.8->Flask-Session) (2.0.1)
Installing collected packages: cachelib, Flask-Session
Successfully installed Flask-Session-0.4.0 cachelib-0.2.0
Out[ ]:
['Collecting ChatterBot',
 '  Downloading ChatterBot-1.0.8-py2.py3-none-any.whl (63 kB)',
 '\x1b[?25l',
 '\x1b[K     |█████▏                          | 10 kB 17.9 MB/s eta 0:00:01',
 '\x1b[K     |██████████▎                     | 20 kB 10.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████▌                | 30 kB 8.6 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████▋           | 40 kB 7.9 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████▊      | 51 kB 4.1 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████████ | 61 kB 4.4 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████████████| 63 kB 1.4 MB/s ',
 '\x1b[?25hRequirement already satisfied: python-dateutil<2.9,>=2.8 in /usr/local/lib/python3.7/dist-packages (from ChatterBot) (2.8.1)',
 'Collecting sqlalchemy<1.4,>=1.3',
 '  Downloading SQLAlchemy-1.3.24-cp37-cp37m-manylinux2010_x86_64.whl (1.3 MB)',
 '\x1b[?25l',
 '\x1b[K     |▎                               | 10 kB 26.5 MB/s eta 0:00:01',
 '\x1b[K     |▌                               | 20 kB 10.4 MB/s eta 0:00:01',
 '\x1b[K     |▊                               | 30 kB 14.0 MB/s eta 0:00:01',
 '\x1b[K     |█                               | 40 kB 8.6 MB/s eta 0:00:01',
 '\x1b[K     |█▎                              | 51 kB 7.7 MB/s eta 0:00:01',
 '\x1b[K     |█▌                              | 61 kB 9.0 MB/s eta 0:00:01',
 '\x1b[K     |█▉                              | 71 kB 6.1 MB/s eta 0:00:01',
 '\x1b[K     |██                              | 81 kB 6.9 MB/s eta 0:00:01',
 '\x1b[K     |██▎                             | 92 kB 6.9 MB/s eta 0:00:01',
 '\x1b[K     |██▋                             | 102 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██▉                             | 112 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███                             | 122 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███▍                            | 133 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███▋                            | 143 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███▉                            | 153 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████▏                           | 163 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████▍                           | 174 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████▋                           | 184 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████                           | 194 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████▏                          | 204 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████▍                          | 215 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████▊                          | 225 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████                          | 235 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████▏                         | 245 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████▌                         | 256 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████▊                         | 266 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████                         | 276 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████▎                        | 286 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████▌                        | 296 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████▊                        | 307 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████                        | 317 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████▎                       | 327 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████▌                       | 337 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████▉                       | 348 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████                       | 358 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████▎                      | 368 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████▋                      | 378 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████▉                      | 389 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████                      | 399 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████▎                     | 409 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████▋                     | 419 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████▉                     | 430 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████                     | 440 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████▍                    | 450 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████▋                    | 460 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████▉                    | 471 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████▏                   | 481 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████▍                   | 491 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████▋                   | 501 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████                   | 512 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████▏                  | 522 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████▍                  | 532 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████▊                  | 542 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████                  | 552 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████▏                 | 563 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████▌                 | 573 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████▊                 | 583 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████                 | 593 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████▎                | 604 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████▌                | 614 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████▊                | 624 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████                | 634 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████▎               | 645 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████▌               | 655 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████▉               | 665 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████               | 675 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████▎              | 686 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████▋              | 696 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████▉              | 706 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████              | 716 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████▍             | 727 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████▋             | 737 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████▉             | 747 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████▏            | 757 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████▍            | 768 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████▋            | 778 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████            | 788 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████▏           | 798 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████▍           | 808 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████▋           | 819 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████           | 829 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████▏          | 839 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████▍          | 849 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████▊          | 860 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████          | 870 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████▏         | 880 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████▌         | 890 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████▊         | 901 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████         | 911 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████▎        | 921 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████▌        | 931 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████▊        | 942 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████        | 952 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████▎       | 962 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████▌       | 972 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████▉       | 983 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████       | 993 kB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████▎      | 1.0 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████▋      | 1.0 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████▉      | 1.0 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████      | 1.0 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████▍     | 1.0 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████▋     | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████▉     | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████▏    | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████▍    | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████▋    | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████████    | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████████▏   | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████████▍   | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████████▊   | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████████   | 1.1 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████████▏  | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████████▌  | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |█████████████████████████████▊  | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████████  | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████████▏ | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████████▌ | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |██████████████████████████████▊ | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████████ | 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████████▎| 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████████▌| 1.2 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |███████████████████████████████▊| 1.3 MB 7.2 MB/s eta 0:00:01',
 '\x1b[K     |████████████████████████████████| 1.3 MB 7.2 MB/s ',
 '\x1b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from ChatterBot) (2018.9)',
 'Collecting mathparse<0.2,>=0.1',
 '  Downloading mathparse-0.1.2-py3-none-any.whl (7.2 kB)',
 'Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<2.9,>=2.8->ChatterBot) (1.15.0)',
 'Installing collected packages: sqlalchemy, mathparse, ChatterBot',
 '  Attempting uninstall: sqlalchemy',
 '    Found existing installation: SQLAlchemy 1.4.20',
 '    Uninstalling SQLAlchemy-1.4.20:',
 '      Successfully uninstalled SQLAlchemy-1.4.20',
 'Successfully installed ChatterBot-1.0.8 mathparse-0.1.2 sqlalchemy-1.3.24']
In [ ]:
#sys.path.append('C:/Applications/Machine Learning/NLP/CapstoneProjectNLP/lib')
import sys
sys.path.append('../')
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[ ]:
True
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
import data_augmentation
import preprocess_data 
import train_ml_model
import train_DL_model
import encode_data
import vectorizer
import time
import feature_extraction
from data_augmentation import *
from preprocess_data import *
from train_ml_model import *
from train_DL_model import *
from encode_data import *
from vectorizer import *
from feature_extraction import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score
from nltk import word_tokenize, pos_tag, pos_tag_sents
from sklearn import metrics
from sklearn.metrics import mean_squared_error,log_loss
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from keras.initializers import Constant
from keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional,Input,GlobalMaxPool1D,SpatialDropout1D
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Model,load_model
import keras.optimizers
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers import Dense
from keras.initializers import Constant
from keras.callbacks import ReduceLROnPlateau,EarlyStopping
from keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional
import keras.optimizers
import tensorflow as tf
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,BaggingClassifier,GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import multilabel_confusion_matrix,classification_report,confusion_matrix,accuracy_score,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
/usr/local/lib/python3.7/dist-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning:

sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.

In [ ]:
# accident_safety_data=pd.read_csv("C:/Applications/Machine Learning/NLP/CapstoneProjectNLP/data/hse_data.csv")
accident_safety_data=pd.read_csv("/content/hse_data.csv")
In [ ]:
accident_safety_data.head()
Out[ ]:
Unnamed: 0 Data Countries Local Industry Sector Accident Level Potential Accident Level Genre Employee or Third Party Critical Risk Description
0 0 2016-01-01 00:00:00 Country_01 Local_01 Mining I IV Male Third Party Pressed While removing the drill rod of the Jumbo 08 f...
1 1 2016-01-02 00:00:00 Country_02 Local_02 Mining I IV Male Employee Pressurized Systems During the activation of a sodium sulphide pum...
2 2 2016-01-06 00:00:00 Country_01 Local_03 Mining I III Male Third Party (Remote) Manual Tools In the sub-station MILPO located at level +170...
3 3 2016-01-08 00:00:00 Country_01 Local_04 Mining I I Male Third Party Others Being 9:45 am. approximately in the Nv. 1880 C...
4 4 2016-01-10 00:00:00 Country_01 Local_04 Mining IV IV Male Third Party Others Approximately at 11:45 a.m. in circumstances t...
In [ ]:
accident_safety_data.columns
Out[ ]:
Index(['Unnamed: 0', 'Data', 'Countries', 'Local', 'Industry Sector',
       'Accident Level', 'Potential Accident Level', 'Genre',
       'Employee or Third Party', 'Critical Risk', 'Description'],
      dtype='object')

We can see that the columns "Unnamed" is unwanted, as it will not help us in our analysis. Also, Data column should be renamed to "Date". Therefore, let's drop the column "Unnamed" and rename the column "Data" to "Date"

In [ ]:
#create a backup of the dataset before we make any changes to it
accident_safety_data_new=accident_safety_data.copy(True)
accident_safety_data_new.head()
Out[ ]:
Unnamed: 0 Data Countries Local Industry Sector Accident Level Potential Accident Level Genre Employee or Third Party Critical Risk Description
0 0 2016-01-01 00:00:00 Country_01 Local_01 Mining I IV Male Third Party Pressed While removing the drill rod of the Jumbo 08 f...
1 1 2016-01-02 00:00:00 Country_02 Local_02 Mining I IV Male Employee Pressurized Systems During the activation of a sodium sulphide pum...
2 2 2016-01-06 00:00:00 Country_01 Local_03 Mining I III Male Third Party (Remote) Manual Tools In the sub-station MILPO located at level +170...
3 3 2016-01-08 00:00:00 Country_01 Local_04 Mining I I Male Third Party Others Being 9:45 am. approximately in the Nv. 1880 C...
4 4 2016-01-10 00:00:00 Country_01 Local_04 Mining IV IV Male Third Party Others Approximately at 11:45 a.m. in circumstances t...
In [ ]:
#dropping "Unnamed" column
accident_safety_data_new.drop('Unnamed: 0',axis='columns', inplace=True)
#renaming "Data" column to "Date"
accident_safety_data_new.rename(columns = {'Data':'Date'}, inplace = True)
#renaming "Genre" column to "Gender"
accident_safety_data_new.rename(columns = {'Genre':'Gender'}, inplace = True)
#renaming "Employee or Third party" column to "Employee Type"
accident_safety_data_new.rename(columns = {'Employee or Third Party':'Employee Type'}, inplace = True)
In [ ]:
accident_safety_data_new.duplicated().sum()
Out[ ]:
7
In [ ]:
#Let us view the duplicate records
duplicates = accident_safety_data_new.duplicated()
accident_safety_data_new[duplicates]
Out[ ]:
Date Countries Local Industry Sector Accident Level Potential Accident Level Gender Employee Type Critical Risk Description
77 2016-04-01 00:00:00 Country_01 Local_01 Mining I V Male Third Party (Remote) Others In circumstances that two workers of the Abrat...
262 2016-12-01 00:00:00 Country_01 Local_03 Mining I IV Male Employee Others During the activity of chuteo of ore in hopper...
303 2017-01-21 00:00:00 Country_02 Local_02 Mining I I Male Third Party (Remote) Others Employees engaged in the removal of material f...
345 2017-03-02 00:00:00 Country_03 Local_10 Others I I Male Third Party Venomous Animals On 02/03/17 during the soil sampling in the re...
346 2017-03-02 00:00:00 Country_03 Local_10 Others I I Male Third Party Venomous Animals On 02/03/17 during the soil sampling in the re...
355 2017-03-15 00:00:00 Country_03 Local_10 Others I I Male Third Party Venomous Animals Team of the VMS Project performed soil collect...
397 2017-05-23 00:00:00 Country_01 Local_04 Mining I IV Male Third Party Projection of fragments In moments when the 02 collaborators carried o...
In [ ]:
accident_safety_data_new.drop_duplicates(inplace=True)
In [ ]:
#Let us check the shape of our dataset
accident_safety_data_new.shape
Out[ ]:
(418, 10)

We can see that the dataset has 425 rows and 10 columns

In [ ]:
accident_safety_data_new.head()
Out[ ]:
Date Countries Local Industry Sector Accident Level Potential Accident Level Gender Employee Type Critical Risk Description
0 2016-01-01 00:00:00 Country_01 Local_01 Mining I IV Male Third Party Pressed While removing the drill rod of the Jumbo 08 f...
1 2016-01-02 00:00:00 Country_02 Local_02 Mining I IV Male Employee Pressurized Systems During the activation of a sodium sulphide pum...
2 2016-01-06 00:00:00 Country_01 Local_03 Mining I III Male Third Party (Remote) Manual Tools In the sub-station MILPO located at level +170...
3 2016-01-08 00:00:00 Country_01 Local_04 Mining I I Male Third Party Others Being 9:45 am. approximately in the Nv. 1880 C...
4 2016-01-10 00:00:00 Country_01 Local_04 Mining IV IV Male Third Party Others Approximately at 11:45 a.m. in circumstances t...
In [ ]:
#Let us check for missing values in the dataset
accident_safety_data_new.isna().apply(pd.value_counts)
Out[ ]:
Date Countries Local Industry Sector Accident Level Potential Accident Level Gender Employee Type Critical Risk Description
False 418 418 418 418 418 418 418 418 418 418

We can see that this dataset has no null values.

In [ ]:
#Let us now check the datatype of the dataset and also get to know some more details
accident_safety_data_new.dtypes
Out[ ]:
Date                        object
Countries                   object
Local                       object
Industry Sector             object
Accident Level              object
Potential Accident Level    object
Gender                      object
Employee Type               object
Critical Risk               object
Description                 object
dtype: object

Here, we can see that all the columns of the dataset are of "object" datatype. Coming to the type of data present in each column, we can see that there is a column "Date", which means it holds time series data. All other columns except "Description" are of categorical datatype.

In [ ]:
accident_safety_data_new.describe().T
Out[ ]:
count unique top freq
Date 418 287 2017-02-08 00:00:00 6
Countries 418 3 Country_01 248
Local 418 12 Local_03 89
Industry Sector 418 3 Mining 237
Accident Level 418 5 I 309
Potential Accident Level 418 6 IV 141
Gender 418 2 Male 396
Employee Type 418 3 Third Party 185
Critical Risk 418 33 Others 229
Description 418 411 When starting the activity of removing a coil ... 2

From the above table, we can infer the below:

  1. This dataset contains accident data of 3 countries, out of which Country1 has the most number of accidents.

  2. The data is collected from 3 types of industry sectors.Local_3 has the most number of accidents.

  3. There are 5 major accident levels in which this dataset has been classified.309 accidents are of accident level 1, making it the most frequent accident type. This also means that the data is not distributed evenly.

  4. The data is a consolidation of accidents faced by employees as well as third party vendors and others. Third party employees have faced the most number of accidents according to this dataset.

  5. 396 male employees have been reported to have accidents, which mean the distribution of data in this case is also not evenly balanced.

  6. 33 different types of critical risks have been identified in the dataset.

We have seen that there are quite a few categorical columns in the dataset which can be encoded to numerical values e.g.

  1. Local

  2. Accident Level

  3. Potential Accident Level

UNIVARIATE ANALYSIS

Let us check the distribution of data based on accident levels

In [ ]:
fig = px.histogram(accident_safety_data_new, x="Accident Level",title='Distribution by Accident Level')
fig.show()

We can see that the distribution of Accident Levels is highly imbalanced in the dataset

  1. Let us check the distribution of data based on country.
In [ ]:
fig = px.histogram(accident_safety_data_new, x="Countries" ,title='Distribution by countries')
fig.show()

We can see that "Country_01" has the most number of accident cases.

Let us now see the distribution of accidents with respect to the type of employee.(Employee/ThirdParty/ThirdPartyRemote)

In [ ]:
fig = px.histogram(accident_safety_data_new, x="Employee Type",title='Distribution by Employee type')
fig.show()

From the graph it is very clear that accidents have happened in almost equal proportions among permanent employees or third party contractors, with thrid party contractors a bit on the higher side.

Let us also check the distribution of accidents as per industry sector.

In [ ]:
fig = px.histogram(accident_safety_data_new, x="Industry Sector",title='Distribution by Industry sector')
fig.show()

We can see that majority of accidents have happened in the mining sector, followed by metal industry and other type of industries.

We will now see the distribution of accidents as per Gender

In [ ]:
fig = px.histogram(accident_safety_data_new, x="Gender",title='Distribution by Gender')
fig.show()

Clearly, the distribution of accidents is imbalanced when checked by "Genre". The count of accidents in males is way higher than that in females.

Lastly, let us check the distribution by Locals.

In [ ]:
fig = px.histogram(accident_safety_data_new, x="Local",title='Distribution by local cities')
fig.show()
In [ ]:
fig = px.histogram(accident_safety_data_new, x="Potential Accident Level",title='Distribution by potential accident level')
fig.show()

We can see that most of the people have met with accident having level 1.

In [ ]:
fig = px.histogram(accident_safety_data_new, x="Critical Risk",title='Distribution by critical risk')
fig.show()

We can see from the graph that the Critical risk category "Others" have the most number of accidents. This means we are not clear about the exact risk factor associated with accidents in this dataset.

BIVARIATE ANALYSIS

Let us write a function to see how does the accident level varies with the Industry Sector and Countries

In [ ]:
def plothistograms(data,column_name_x,column_name_y,value,title):
    fig = px.histogram(data, 
                       x=column_name_x, 
                       color=column_name_y,
                       barmode=value,
                       title=title
                       )
    fig.show()
  1. We will see the distribution of different accident levels occured per country
In [ ]:
plothistograms(accident_safety_data_new,"Countries","Accident Level","relative",'Distribution of various accident levels per country.')

Observations from the above graph:

  1. Accident Level V accidents have occured only in Country I.
  2. Maximum number of accidents in all countries are mainly of type Accident Level I.
  3. Country_01 has had accidents of all Accident types, making it the most riskiest place as per the dataset.
In [ ]:
plothistograms(accident_safety_data_new,"Industry Sector","Accident Level","group",'Distribution of various accident levels per industry sector.')

The most number of accidents have occured in the Mining Industry in Country 1 so far, followed by the metal industry, also in Country 1.

  1. Next, let us see how many accidents have occured per Local
In [ ]:
plothistograms(accident_safety_data_new,"Local","Industry Sector","stack",'Distribution of various industry sectors per local city.')
plothistograms(accident_safety_data_new,"Countries","Industry Sector","stack","Distribution of various accident levels per country.")
plothistograms(accident_safety_data_new,"Industry Sector","Accident Level","stack",'Distribution of various accident levels per industry sector.')
  1. Local 01,Local 02,Local 03,Local 04,Local 07 all have plants belonging to the Mining Sector and they have had the most number of accidents.
  2. Industry sectors "Other" have had the least number of accidents.
  3. Local 09 and Local_11 seems to be the safest cities, with only 2 accidents, even though it has plants belonging to the Metal sector.
In [ ]:
accident_safety_data_new['Critical Risk'].value_counts()
Out[ ]:
Others                                       229
Pressed                                       24
Manual Tools                                  20
Chemical substances                           17
Cut                                           14
Projection                                    13
Venomous Animals                              13
Bees                                          10
Fall                                           9
Vehicles and Mobile Equipment                  8
remains of choco                               7
Pressurized Systems                            7
Fall prevention (same level)                   7
Fall prevention                                6
Suspended Loads                                6
Liquid Metal                                   3
Power lock                                     3
Pressurized Systems / Chemical Substances      3
Blocking and isolation of energies             3
Machine Protection                             2
Electrical Shock                               2
Poll                                           1
Plates                                         1
Projection of fragments                        1
Confined space                                 1
Projection/Choco                               1
\nNot applicable                               1
Individual protection equipment                1
Projection/Burning                             1
Electrical installation                        1
Traffic                                        1
Burn                                           1
Projection/Manual Tools                        1
Name: Critical Risk, dtype: int64
In [ ]:
df = accident_safety_data_new.copy(True)
lb_make = LabelEncoder()
df['Accident_Level_labelencoded'] = lb_make.fit_transform(df['Accident Level'])
df['Accident_Level_labelencoded']=df['Accident_Level_labelencoded']+1
fig = px.scatter(df, x="Critical Risk", y="Accident Level", color="Industry Sector",
                 size='Accident_Level_labelencoded', hover_data=['Industry Sector']
                 ,title="Spread of Accident Levels by Critical risk category per Industry sector")
fig.show()

From the above graph we can see the following:

  1. There are numerous risks involved in the Metals sector, followed by the ones in the Mining sector.
  2. Comparitively very low risks are there in the "Other" industry sector.
In [ ]:
df = accident_safety_data_new.copy(True)
lb_make = LabelEncoder()
df['Accident_Level_labelencoded'] = lb_make.fit_transform(df['Accident Level'])
df['Accident_Level_labelencoded']=df['Accident_Level_labelencoded']+1
fig = px.scatter(df, x="Critical Risk", y="Accident Level", color="Employee Type",
                 size='Accident_Level_labelencoded', hover_data=['Critical Risk']
                 ,title="Spread of Accident Levels by Critical Risk category per Employee Type"
                 ,width=1200,height=600)
fig.show()

From the above graph we can make the below observations:

  1. Mostly third party contractors(both on site and remote) have had accidents of notably all Accident Levels in the "Others" risk category.

  2. "Pressed" risks are the second most dangerous ones where employees and contractors both have had accidents.

In [ ]:
plothistograms(accident_safety_data_new,"Potential Accident Level","Industry Sector","stack",'Distribution of Potential Accident Level by Industry Sector')
  1. Major number of accidents have occured in the Potential Accident Level 3 category.
  2. Potential Accident Level 5 is least in the mining industry.

NLP analysis

Let us see the most frequent words used for each accident level now.

  1. Accident Level I
In [ ]:
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='I'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350, 
                      random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level I", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
  1. Accident Level II
In [ ]:
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='II'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350, 
                      random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level II", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
  1. Accident Level III
In [ ]:
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='III'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350, 
                      random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level III", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
  1. Accident Level IV
In [ ]:
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='IV'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350, 
                      random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level IV", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

Accident Level V

In [ ]:
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='V'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350, 
                      random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level V", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

DATA AUGMENTATION

In [ ]:
#Let us first create a dataset using only the class variable "Accident Level" and Description column.
accident_safety_data_trimmed=accident_safety_data_new.copy(True)
accident_safety_data_trimmed['Accident_Level']=accident_safety_data_trimmed.apply(lambda col: str(col['Accident Level']), axis=1)
accident_safety_data_trimmed['Description']=accident_safety_data_trimmed.apply(lambda col: str(col['Description']), axis=1)
accident_safety_data_trimmed=accident_safety_data_trimmed[['Accident_Level','Description']]
accident_safety_data_trimmed.head()
Out[ ]:
Accident_Level Description
0 I While removing the drill rod of the Jumbo 08 f...
1 I During the activation of a sodium sulphide pum...
2 I In the sub-station MILPO located at level +170...
3 I Being 9:45 am. approximately in the Nv. 1880 C...
4 IV Approximately at 11:45 a.m. in circumstances t...
In [ ]:
labels, frequencies = np.unique(accident_safety_data_trimmed.Accident_Level.values, return_counts=True)

fig = px.pie(accident_safety_data_trimmed, values=frequencies, names=labels, title='Frequency of Description by Accident Level')
fig.show()

We can clearly see that the Description column is imbalanced in the dataset. Most of the description is present only for Accident Level I(0).

We will now check the exact counts of Descriptions per Accident level.

In [ ]:
accident_safety_data_trimmed.Accident_Level.value_counts().values
for u in accident_safety_data_trimmed.Accident_Level.unique().tolist():
    print(u)
I
IV
III
II
V

Let us first divide our data into train and test samples

We will try augmentation techniques so that the data is balanced properly before it is passed into the dataset.

EDA

Using EDA let us perform data augmentation

Let us divide data of each Accident Level in different dataframes

In [ ]:
options=['I']
df_0=accident_safety_data_trimmed.copy(True)
df_0=df_0.loc[df_0['Accident_Level'].isin(options)]
df_0.shape
Out[ ]:
(309, 2)
In [ ]:
options=['II']
df_1=accident_safety_data_trimmed.copy(True)
df_1=df_1.loc[df_1['Accident_Level'].isin(options)]
df_1.shape
Out[ ]:
(40, 2)
In [ ]:
options=['III']
df_2=accident_safety_data_trimmed.copy(True)
df_2=df_2.loc[df_2['Accident_Level'].isin(options)]
df_2.shape
Out[ ]:
(31, 2)
In [ ]:
options=['IV']
df_3=accident_safety_data_trimmed.copy(True)
df_3=df_3.loc[df_3['Accident_Level'].isin(options)]
df_3.shape
Out[ ]:
(30, 2)
In [ ]:
options=['V']
df_4=accident_safety_data_trimmed.copy(True)
df_4=df_4.loc[df_4['Accident_Level'].isin(options)]
df_4.shape
Out[ ]:
(8, 2)

Now, we will augment each dataset seperately. Here the gen_eda function from data_augmentation.py takes in the below parameters:

dataset - dataframe name alpha_sr - percentage of words in the dataset we want to replace with synonyms.

alpha_ri - percentage of words in the dataset we want to randomly insert.

alpha_rs - percentage of words in the dataset we want to randomly swap.

alpha_rd - percentage of words in the dataset we want to randomly delete.

num_aug - total number of augmented sentences we want per sentence in the dataset.

In [ ]:
df_0_up=gen_eda(df_0,0.7,0.1,0.2,0.15,2)
df_1_up=gen_eda(df_1,0.7,0.2,0.2,0.1,22)
df_2_up=gen_eda(df_2,0.7,0.2,0.2,0.1,29)
df_3_up=gen_eda(df_3,0.7,0.2,0.2,0.1,30)
df_4_up=gen_eda(df_4,0.7,0.2,0.2,0.1,114)
In [ ]:
accident_safety_data_upsampled = pd.concat([df_0_up,df_1_up,df_2_up,df_3_up,df_4_up])
In [ ]:
accident_safety_data_upsampled.describe().T
Out[ ]:
count unique top freq
Accident_Level 4627 5 III 930
Description 4627 4617 In the activity of loading of explosives in fr... 2
In [ ]:
accident_safety_data_upsampled.shape
Out[ ]:
(4627, 2)
In [ ]:
labels, frequencies = np.unique(accident_safety_data_upsampled.Accident_Level.values, return_counts=True)

fig = px.pie(accident_safety_data_upsampled, values=frequencies, names=labels, title='Frequency of Description by Accident Level')
fig.show()
In [ ]:
  accident_safety_data_upsampled["Description_DL"] = accident_safety_data_upsampled["Description"].apply(lambda x: clean_DL_data1(x))
In [ ]:
accident_safety_data_upsampled["Description_ML"] = accident_safety_data_upsampled["Description"].apply(lambda x: clean_data(x))
In [ ]:
accident_safety_data_upsampled.head(10)
Out[ ]:
Accident_Level Description Description_DL Description_ML
0 I piece get rid of the mandrillus leucophaeus re... piece get rid of the mandrillus leucophaeus re... piec get rid mandrillu leucophaeu retin rod ga...
1 I While removing the drill rod of the Jumbo 08 f... While removing the drill rod of the Jumbo for... remov drill rod jumbo mainten radio beam super...
2 I While removing the drill rod of the Jumbo 08 f... While removing the drill rod of the Jumbo for... remov drill rod jumbo mainten supervisor proce...
3 I During the energizing of a atomic number sulp... During the energizing of a atomic number sulp... energ atom number sulphid pump pipe decoupl su...
4 I During the activation of deoxyadenosine monoph... During the activation of deoxyadenosine monoph... activ deoxyadenosin monophosph sodium sulphid ...
5 I During the activation of a sodium sulphide pum... During the activation of a sodium sulphide pum... activ sodium sulphid pump pipe uncoupl sulfid ...
6 I atomic number the sub-station MILPO turn up a... atomic number the substation MILPO turn up at... atom number substat milpo turn rase partner di...
7 I In the sub-station MILPO located at level conf... In the substation MILPO located at level confe... substat milpo locat level confeder vapid colla...
8 I In the sub-station MILPO located at level +170... In the substation MILPO located at level when... substat milpo locat level collabor excav work ...
9 I be 9:45 am. or so in the Nv. 1880 CX-695 OB7, ... be 9 am or so in the Nv CX OB7 the personnel ... personnel depart start chore unlock soquet bol...
In [ ]:
print(accident_safety_data_upsampled.Description[1])
print(accident_safety_data_upsampled.Description_DL[1])
print(accident_safety_data_upsampled.Description_ML[1])
1    While removing the drill rod of the Jumbo 08 f...
1    while align the redress square bracket of pred...
1    atomic number  Rp 050 of flat 1620, in fate wh...
1    close to at 11:45 am in fate that the mechanic...
1    close to 1:40 pm in destiny that shotcrete was...
Name: Description, dtype: object
1    While removing the drill rod of the Jumbo  for...
1    while align the redress square bracket of pred...
1    atomic number  Rp  of flat  in fate where the ...
1    close to at  am in fate that the mechanic susa...
1    close to 1 pm in destiny that shotcrete was es...
Name: Description_DL, dtype: object
1    remov drill rod jumbo mainten radio beam super...
1    align redress squar bracket predomin normal re...
1    atom number flat fate proletarian ship compani...
1    close fate mechan susan anthoni group leader e...
1    close destini shotcret establish complet launc...
Name: Description_ML, dtype: object

Named entity recognition

In [ ]:
accident_safety_data_upsampled.to_csv('upsampled_accident_safety_data.csv',index=False,encoding='utf-8')
In [ ]:
accident_safety_upsampled_pos=pd.read_csv('upsampled_accident_safety_data.csv')
In [ ]:
accident_safety_upsampled_pos['POSTags'] = pos_tag_sents(accident_safety_upsampled_pos['Description_DL'].apply(word_tokenize).tolist())
In [ ]:
accident_safety_upsampled_pos.POSTags[:1]
Out[ ]:
0    [(piece, NN), (get, VB), (rid, JJ), (of, IN), ...
Name: POSTags, dtype: object

Feature extraction. We will try the below vectorizers

  1. Count Vectorizer
  2. TF IDF vectorizer
  3. WordVec
  4. GlOve

We will first work with data cleaned for machine learning and then data cleaned for deep learning.

Using Count Vectorizer

In [ ]:
#unigrams
count_train_cv_ML_1,features_cv_ML_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1)
count_train_cv_DL_1,features_cv_DL_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1)
count_train_cv_1,features_cv_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description,1)
x_orig_cv_1=pd.DataFrame(count_train_cv_1,columns=list(features_cv_1))
x_DL_cv_1=pd.DataFrame(count_train_cv_DL_1,columns=list(features_cv_DL_1))
x_ML_cv_1=pd.DataFrame(count_train_cv_ML_1,columns=list(features_cv_ML_1))

#unigrams and bigrams
count_train_cv_ML_2,features_cv_ML_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2)
count_train_cv_DL_2,features_cv_DL_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2)
count_train_cv_2,features_cv_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description,2)
x_orig_cv_2=pd.DataFrame(count_train_cv_2,columns=list(features_cv_2))
x_DL_cv_2=pd.DataFrame(count_train_cv_DL_2,columns=list(features_cv_DL_2))
x_ML_cv_2=pd.DataFrame(count_train_cv_ML_2,columns=list(features_cv_ML_2))

#unigrams, bigrams and trigrams
# count_train_cv_ML_3,features_cv_ML_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3)
# count_train_cv_DL_3,features_cv_DL_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3)
# count_train_cv_3,features_cv_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description,3)
# x_orig_cv_3=pd.DataFrame(count_train_cv_3,columns=list(features_cv_3))
# x_DL_cv_3=pd.DataFrame(count_train_cv_DL_3,columns=list(features_cv_DL_3))
# x_ML_cv_3=pd.DataFrame(count_train_cv_ML_3,columns=list(features_cv_ML_3))
In [ ]:
#Let us see the shape of the dataset

#Deep learning dataset
print('Number of unigram features generated in the deep learning dataset:',x_DL_cv_1.shape)
print('Number of bigrams features generated in the deep learning dataset:',x_DL_cv_2.shape)
# print('Number of trigrams features generated in the deep learning dataset:',x_DL_cv_3.shape)

#Machine learning dataset
print('Number of unigram features generated in the machine learning dataset:',x_ML_cv_1.shape)
print('Number of bigrams features generated in the machine learning dataset:',x_ML_cv_2.shape)
# print('Number of trigrams features generated in the machine learning dataset:',x_ML_cv_3.shape)
Number of unigram features generated in the deep learning dataset: (4627, 7884)
Number of bigrams features generated in the deep learning dataset: (4627, 89499)
Number of unigram features generated in the machine learning dataset: (4627, 5944)
Number of bigrams features generated in the machine learning dataset: (4627, 83850)

Since the number of features generated are very large in number, we will suffer from curse of dimensionality, hence reducing the number of features to 750

In [ ]:
#unigrams
count_train_cv_ML_1,features_cv_ML_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1,750)
count_train_cv_DL_1,features_cv_DL_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1,750)
count_train_cv_1,features_cv_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description,1,750)
x_orig_cv_1=pd.DataFrame(count_train_cv_1,columns=list(features_cv_1))
x_DL_cv_1=pd.DataFrame(count_train_cv_DL_1,columns=list(features_cv_DL_1))
x_ML_cv_1=pd.DataFrame(count_train_cv_ML_1,columns=list(features_cv_ML_1))

#unigrams and bigrams
count_train_cv_ML_2,features_cv_ML_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2,750)
count_train_cv_DL_2,features_cv_DL_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2,750)
count_train_cv_2,features_cv_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description,2,750)
x_orig_cv_2=pd.DataFrame(count_train_cv_2,columns=list(features_cv_2))
x_DL_cv_2=pd.DataFrame(count_train_cv_DL_2,columns=list(features_cv_DL_2))
x_ML_cv_2=pd.DataFrame(count_train_cv_ML_2,columns=list(features_cv_ML_2))

# #unigrams,bigrams and trigrams
# count_train_cv_ML_3,features_cv_ML_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3,750)
# count_train_cv_DL_3,features_cv_DL_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3,750)
# count_train_cv_3,features_cv_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description,3,750)
# x_orig_cv_3=pd.DataFrame(count_train_cv_3,columns=list(features_cv_3))
# x_DL_cv_3=pd.DataFrame(count_train_cv_DL_3,columns=list(features_cv_DL_3))
# x_ML_cv_3=pd.DataFrame(count_train_cv_ML_3,columns=list(features_cv_ML_3))

Deep learning features

In [ ]:
# #print unigram features
print(count_train_cv_DL_1.shape)
print(features_cv_DL_1[:50])

# #print bigram features
print(count_train_cv_DL_2.shape)
print(features_cv_DL_2[:50])

# # #print trigram features
# # print(count_train_cv_DL_3.shape)
# # print(features_cv_DL_3[:50])
(4627, 750)
['14x07x02', '15', '3m', '4th', '56', 'about', 'access', 'accident', 'accumulating', 'acid', 'action', 'activated', 'activities', 'activity', 'adapted', 'adhesion', 'adjoining', 'after', 'against', 'air', 'albertico', 'albino', 'all', 'allow', 'alone', 'along', 'alpha', 'am', 'ampoloader', 'amputation', 'an', 'and', 'anfo', 'anfoloader', 'anode', 'another', 'answer', 'any', 'approaching', 'approx', 'approximate', 'approximately', 'are', 'area', 'arm', 'around', 'as', 'ask', 'asks', 'assigned']
(4627, 750)
['14x07x02', '3m', '56', 'about', 'access', 'accident', 'accident the', 'acid', 'action', 'activated', 'activities', 'activity', 'activity of', 'after', 'after having', 'against', 'against the', 'air', 'albertico', 'all', 'allow', 'alone', 'along', 'alpha', 'am', 'ampoloader', 'amputation', 'an', 'and', 'and at', 'and his', 'and hits', 'and in', 'and the', 'and when', 'anfo', 'another', 'any', 'approaching', 'approx', 'approximately', 'approximately in', 'approximately pm', 'approximately the', 'are', 'area', 'arm', 'around', 'as', 'ask']

Machine learning features

In [ ]:
#print unigram features
print(count_train_cv_ML_1.shape)
print(features_cv_ML_1[:50])

#print bigram features
print(count_train_cv_ML_2.shape)
print(features_cv_ML_2[:50])

# #print trigram features
# # print(count_train_cv_ML_3.shape)
# # print(features_cv_ML_3[:50])
(4627, 750)
['abl', 'access', 'accid', 'accident', 'accompani', 'accumul', 'acid', 'action', 'activ', 'adapt', 'addit', 'adhes', 'adjoin', 'adjunct', 'adjust', 'administr', 'advanc', 'affect', 'afterward', 'aid', 'air', 'albertico', 'albino', 'allow', 'alon', 'along', 'alpha', 'ampoload', 'amput', 'anchor', 'anfo', 'anfoload', 'angl', 'anod', 'anoth', 'answer', 'appar', 'appli', 'approach', 'approx', 'approxim', 'area', 'arm', 'around', 'arrang', 'arriv', 'ascend', 'ask', 'assembl', 'assign']
(4627, 750)
['access', 'accid', 'accid employe', 'accident', 'accompani', 'accumul', 'acid', 'action', 'activ', 'activ trap', 'activ verifi', 'adapt', 'adhes', 'adjoin', 'adjoin cell', 'adjust', 'affect', 'afterward', 'aid', 'air', 'albertico', 'albino', 'allow', 'alon', 'alon cut', 'along', 'alpha', 'alpha albertico', 'ampoload', 'amput', 'anchor', 'anfo', 'anfoload', 'anod', 'anoth', 'answer', 'appli', 'approach', 'approx', 'approxim', 'approxim circumst', 'approxim oper', 'approxim tecnomin', 'area', 'arm', 'around', 'arrang', 'arriv', 'ask', 'ask oper']
In [ ]:
x_ML_cv_2.head()
Out[ ]:
access accid accid employe accident accompani accumul acid action activ activ trap activ verifi adapt adhes adjoin adjoin cell adjust affect afterward aid air albertico albino allow alon alon cut along alpha alpha albertico ampoload amput anchor anfo anfoload anod anoth answer appli approach approx approxim ... twenti two type unit unload unlock untim upon upper upper edg use use rubber utilis valv vehicl verifi victim visual wall water way wear weight went wheeler wheeler dealer william winch wineri wineri chagua wire withdraw without wood wooden work work area worker wound zone
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 750 columns

  1. Now we will use TF IDF Vectorizer also.
In [ ]:
#unigrams
count_train_tfidf_ML_1,features_tfidf_ML_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1)
count_train_tfidf_DL_1,features_tfidf_DL_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1)
count_train_tfidf_1,features_tfidf_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1)
x_orig_tfidf_1=pd.DataFrame(count_train_tfidf_1,columns=list(features_tfidf_1))
x_DL_tfidf_1=pd.DataFrame(count_train_tfidf_DL_1,columns=list(features_tfidf_DL_1))
x_ML_tfidf_1=pd.DataFrame(count_train_tfidf_ML_1,columns=list(features_tfidf_ML_1))

#bigrams
count_train_tfidf_ML_2,features_tfidf_ML_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2)
count_train_tfidf_DL_2,features_tfidf_DL_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2)
count_train_tfidf_2,features_tfidf_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2)
x_orig_tfidf_2=pd.DataFrame(count_train_tfidf_2,columns=list(features_tfidf_2))
x_DL_tfidf_2=pd.DataFrame(count_train_tfidf_DL_2,columns=list(features_tfidf_DL_2))
x_ML_tfidf_2=pd.DataFrame(count_train_tfidf_ML_2,columns=list(features_tfidf_ML_2))

#trigrams
# count_train_tfidf_ML_3,features_tfidf_ML_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3)
# count_train_tfidf_DL_3,features_tfidf_DL_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3)
# count_train_tfidf_3,features_tfidf_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3)
# x_orig_tfidf_3=pd.DataFrame(count_train_tfidf_3,columns=list(features_tfidf_3))
# x_DL_tfidf_3=pd.DataFrame(count_train_tfidf_DL_3,columns=list(features_tfidf_DL_3))
# x_ML_tfidf_3=pd.DataFrame(count_train_tfidf_ML_3,columns=list(features_tfidf_ML_3))
In [ ]:
#Let us see the shape of the dataset

#Deep learning dataset
print('Number of unigram features generated in the deep learning dataset:',x_DL_tfidf_1.shape)
print('Number of bigrams features generated in the deep learning dataset:',x_DL_tfidf_2.shape)
# print('Number of trigrams features generated in the deep learning dataset:',x_DL_tfidf_3.shape)

#Machine learning dataset
print('Number of unigram features generated in the machine learning dataset:',x_ML_tfidf_1.shape)
print('Number of bigrams features generated in the machine learning dataset:',x_ML_tfidf_2.shape)
# print('Number of trigrams features generated in the machine learning dataset:',x_ML_tfidf_3.shape)
Number of unigram features generated in the deep learning dataset: (4627, 7884)
Number of bigrams features generated in the deep learning dataset: (4627, 89499)
Number of unigram features generated in the machine learning dataset: (4627, 5944)
Number of bigrams features generated in the machine learning dataset: (4627, 83850)
In [ ]:
#Reducing the features to 750
#unigrams
count_train_tfidf_ML_1,features_tfidf_ML_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1,750)
count_train_tfidf_DL_1,features_tfidf_DL_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1,750)
count_train_tfidf_1,features_tfidf_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1,750)
x_orig_tfidf_1=pd.DataFrame(count_train_tfidf_1,columns=list(features_tfidf_1))
x_DL_tfidf_1=pd.DataFrame(count_train_tfidf_DL_1,columns=list(features_tfidf_DL_1))
x_ML_tfidf_1=pd.DataFrame(count_train_tfidf_ML_1,columns=list(features_tfidf_ML_1))

#bigrams
count_train_tfidf_ML_2,features_tfidf_ML_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2,750)
count_train_tfidf_DL_2,features_tfidf_DL_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2,750)
count_train_tfidf_2,features_tfidf_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2,750)
x_orig_tfidf_2=pd.DataFrame(count_train_tfidf_2,columns=list(features_tfidf_2))
x_DL_tfidf_2=pd.DataFrame(count_train_tfidf_DL_2,columns=list(features_tfidf_DL_2))
x_ML_tfidf_2=pd.DataFrame(count_train_tfidf_ML_2,columns=list(features_tfidf_ML_2))

#trigrams
# count_train_tfidf_ML_3,features_tfidf_ML_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3,750)
# count_train_tfidf_DL_3,features_tfidf_DL_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3,750)
# count_train_tfidf_3,features_tfidf_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3,750)
# x_orig_tfidf_3=pd.DataFrame(count_train_tfidf_3,columns=list(features_tfidf_3))
# x_DL_tfidf_3=pd.DataFrame(count_train_tfidf_DL_3,columns=list(features_tfidf_DL_3))
# x_ML_tfidf_3=pd.DataFrame(count_train_tfidf_ML_3,columns=list(features_tfidf_ML_3))

Deep learning features for TF IDF vectorizer

Machine learning features for TF IDF vectorizer

In [ ]:
#print unigram features
print(count_train_tfidf_ML_1.shape)
print(features_tfidf_ML_1[:50])

#print bigram features
print(count_train_tfidf_ML_2.shape)
print(features_tfidf_ML_2[:50])

#print trigram features
# print(count_train_tfidf_ML_3.shape)
# print(features_tfidf_ML_3[:50])
(4627, 750)
['abl', 'access', 'accid', 'accident', 'accompani', 'accumul', 'acid', 'action', 'activ', 'adapt', 'addit', 'adhes', 'adjoin', 'adjunct', 'adjust', 'administr', 'advanc', 'affect', 'afterward', 'aid', 'air', 'albertico', 'albino', 'allow', 'alon', 'along', 'alpha', 'ampoload', 'amput', 'anchor', 'anfo', 'anfoload', 'angl', 'anod', 'anoth', 'answer', 'appar', 'appli', 'approach', 'approx', 'approxim', 'area', 'arm', 'around', 'arrang', 'arriv', 'ascend', 'ask', 'assembl', 'assign']
(4627, 750)
['access', 'accid', 'accid employe', 'accident', 'accompani', 'accumul', 'acid', 'action', 'activ', 'activ trap', 'activ verifi', 'adapt', 'adhes', 'adjoin', 'adjoin cell', 'adjust', 'affect', 'afterward', 'aid', 'air', 'albertico', 'albino', 'allow', 'alon', 'alon cut', 'along', 'alpha', 'alpha albertico', 'ampoload', 'amput', 'anchor', 'anfo', 'anfoload', 'anod', 'anoth', 'answer', 'appli', 'approach', 'approx', 'approxim', 'approxim circumst', 'approxim oper', 'approxim tecnomin', 'area', 'arm', 'around', 'arrang', 'arriv', 'ask', 'ask oper']

Let us now one hot encode the class variable Accident Level

In [ ]:
lb_make = LabelEncoder()
accident_safety_upsampled_pos['Accident_Level'] = lb_make.fit_transform(accident_safety_upsampled_pos['Accident_Level'])
In [ ]:
y_DL = pd.get_dummies(accident_safety_upsampled_pos['Accident_Level']).values
y_ML = pd.get_dummies(accident_safety_upsampled_pos['Accident_Level']).values
y_orig = pd.get_dummies(accident_safety_upsampled_pos['Accident_Level']).values
In [ ]:
x_ML_cv_1 = x_ML_cv_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_ML_cv_2 = x_ML_cv_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_ML_cv_3 = x_ML_cv_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_cv_1 = x_DL_cv_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_cv_2 = x_DL_cv_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_DL_cv_3 = x_DL_cv_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_tfidf_1 = x_DL_tfidf_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_tfidf_2 = x_DL_tfidf_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_DL_tfidf_3 = x_DL_tfidf_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_ML_tfidf_1 = x_ML_tfidf_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_ML_tfidf_2 = x_ML_tfidf_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_ML_tfidf_3 = x_ML_tfidf_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))

Now our data is ready, so we can divide the data into test and train.

Dataset to be used for deep learning

Count Vectorized data

Unigrams : x_DL_cv_1 , y_DL

Bigrams : x_DL_cv_2 , y_DL

Trigrams : x_DL_cv_3 , y_DL

TF IDF Vectorized data

Unigrams : x_DL_tfidf_1 , y_DL

Bigrams : x_DL_tfidf_2 , y_DL

Trigrams : x_DL_tfidf_3 , y_DL

Dataset to be used for machine learning models

Count Vectorized data

Unigrams : x_ML_cv_1 , y_ML

Bigrams : x_ML_cv_2 , y_ML

Trigrams : x_ML_cv_3 , y_ML

TF IDF Vectorized data

Unigrams : x_ML_tfidf_1 , y_ML

Bigrams : x_ML_tfidf_2 , y_ML

Trigrams : x_ML_tfidf_3 , y_ML

Let us now input this data into machine learning models

Step 1

  1. Split the data into 80 and 20
  2. Using TF IDF vectorized data
  3. Unigrams
In [ ]:
X=x_ML_tfidf_1.drop(['Accident_Level'],axis=1)
Y=x_ML_tfidf_1.Accident_Level
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,  Y, test_size = 0.20, random_state = 1, stratify = y_ML)
In [ ]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(3701, 750)
(926, 750)
(3701,)
(926,)
In [ ]:
# For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' handle multinomial loss; 'liblinear' is limited to one-versus-rest schemes.

resultsDf = pd.DataFrame()

# Building a Linear Regression model
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state = 1)
                                                     
# Train and Test the model
resultsDf = train_test_model(lr, 'Logistic Regression', X_train, X_test, y_train, y_test, 'none', 1, 'no', 'yes', 'no')

# Store the accuracy results for each model in a dataframe for final comparison
resultsDf
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       186
           1       0.90      0.86      0.88       184
           2       0.93      0.97      0.95       186
           3       0.92      0.98      0.95       186
           4       1.00      0.99      1.00       184

    accuracy                           0.93       926
   macro avg       0.93      0.93      0.93       926
weighted avg       0.93      0.93      0.93       926

Out[ ]:
Method Train Accuracy Test Accuracy Precision Recall F1-Score Multi-Class Logloss
1 Logistic Regression 0.975142 0.929806 0.929247 0.929806 0.929065 0.364928
In [ ]:
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       186
           1       0.90      0.86      0.88       184
           2       0.93      0.97      0.95       186
           3       0.92      0.98      0.95       186
           4       1.00      0.99      1.00       184

    accuracy                           0.93       926
   macro avg       0.93      0.93      0.93       926
weighted avg       0.93      0.93      0.93       926

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
***************************************************************************
              precision    recall  f1-score   support

           0       0.97      0.84      0.90       186
           1       0.90      0.96      0.93       184
           2       0.95      0.99      0.97       186
           3       0.95      0.96      0.95       186
           4       0.97      1.00      0.99       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.88      0.96      0.92       186
           1       0.99      0.88      0.93       184
           2       0.98      0.97      0.98       186
           3       0.95      0.97      0.96       186
           4       1.00      0.99      1.00       184

    accuracy                           0.96       926
   macro avg       0.96      0.96      0.96       926
weighted avg       0.96      0.96      0.96       926

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')
***************************************************************************
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       186
           1       0.88      0.84      0.86       184
           2       0.91      0.91      0.91       186
           3       0.91      0.87      0.89       186
           4       0.99      0.99      0.99       184

    accuracy                           0.90       926
   macro avg       0.90      0.90      0.90       926
weighted avg       0.90      0.90      0.90       926

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       186
           1       0.95      0.91      0.93       184
           2       0.97      0.97      0.97       186
           3       0.97      0.94      0.95       186
           4       1.00      0.99      1.00       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=0.75, n_estimators=30,
                  n_jobs=None, oob_score=True, random_state=1, verbose=0,
                  warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       186
           1       0.93      0.88      0.90       184
           2       0.93      0.96      0.94       186
           3       0.95      0.96      0.95       186
           4       0.99      1.00      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.25,
                   n_estimators=100, random_state=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.27      0.64      0.38       186
           1       0.52      0.50      0.51       184
           2       0.71      0.48      0.57       186
           3       0.65      0.34      0.45       186
           4       1.00      0.49      0.66       184

    accuracy                           0.49       926
   macro avg       0.63      0.49      0.51       926
weighted avg       0.63      0.49      0.51       926

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=1, subsample=1.0, tol=0.0001,
                           validation_fraction=0.2, verbose=0,
                           warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       186
           1       0.87      0.83      0.85       184
           2       0.93      0.95      0.94       186
           3       0.91      0.91      0.91       186
           4       1.00      0.98      0.99       184

    accuracy                           0.89       926
   macro avg       0.89      0.89      0.89       926
weighted avg       0.89      0.89      0.89       926

<catboost.core.CatBoostClassifier object at 0x7fe6e3978b10>
***************************************************************************
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       186
           1       0.96      0.89      0.92       184
           2       0.97      0.98      0.98       186
           3       0.94      0.96      0.95       186
           4       1.00      0.99      1.00       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.4,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softmax', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       186
           1       0.94      0.87      0.90       184
           2       0.97      0.96      0.96       186
           3       0.94      0.95      0.94       186
           4       1.00      1.00      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

Out[ ]:
Method Train Accuracy Test Accuracy Precision Recall F1-Score Multi-Class Logloss
1 LogisticRegression 0.975142 0.929806 0.929247 0.929806 0.929065 0.364928
2 KNeighborsClassifier 0.986490 0.949244 0.950358 0.949244 0.948489 0.564490
3 SVC 0.996217 0.955724 0.958046 0.955724 0.955854 0.148082
4 DecisionTreeClassifier 0.998649 0.898488 0.900153 0.898488 0.898824 3.362077
5 RandomForestClassifier 0.997028 0.949244 0.950492 0.949244 0.949533 0.367133
6 BaggingClassifier 0.997298 0.939525 0.939604 0.939525 0.939337 0.402840
7 AdaBoostClassifier 0.513375 0.489201 0.628692 0.489201 0.512542 1.309672
8 GradientBoostingClassifier 0.936233 0.890929 0.892741 0.890929 0.891539 0.548589
9 CatBoostClassifier 0.997298 0.952484 0.953045 0.952484 0.952420 0.245901
10 XGBClassifier 0.979735 0.935205 0.936802 0.935205 0.935481 0.287221

Let us now perform the same steps with

Step 1

  1. Split the data into 80 and 20
  2. Using TF IDF vectorized data
  3. Bi grams
In [ ]:
#unigrams and bigrams
X=x_ML_tfidf_2.drop(['Accident_Level'],axis=1)
Y=x_ML_tfidf_2.Accident_Level
X_train, X_test, y_train, y_test = train_test_split(X,  Y, test_size = 0.20, random_state = 1, stratify = y_ML)
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.86      0.83      0.84       186
           1       0.88      0.87      0.87       184
           2       0.91      0.95      0.93       186
           3       0.94      0.95      0.94       186
           4       1.00      0.99      1.00       184

    accuracy                           0.92       926
   macro avg       0.92      0.92      0.92       926
weighted avg       0.92      0.92      0.92       926

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
***************************************************************************
              precision    recall  f1-score   support

           0       0.98      0.84      0.90       186
           1       0.90      0.95      0.93       184
           2       0.95      1.00      0.97       186
           3       0.95      0.96      0.95       186
           4       0.98      1.00      0.99       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       186
           1       0.98      0.89      0.93       184
           2       0.98      0.97      0.98       186
           3       0.95      0.95      0.95       186
           4       1.00      0.99      1.00       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')
***************************************************************************
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       186
           1       0.81      0.87      0.84       184
           2       0.93      0.92      0.93       186
           3       0.92      0.89      0.90       186
           4       0.99      0.99      0.99       184

    accuracy                           0.89       926
   macro avg       0.89      0.89      0.89       926
weighted avg       0.89      0.89      0.89       926

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.88      0.91      0.89       186
           1       0.93      0.91      0.92       184
           2       0.96      0.97      0.97       186
           3       0.97      0.95      0.96       186
           4       1.00      1.00      1.00       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=0.75, n_estimators=30,
                  n_jobs=None, oob_score=True, random_state=1, verbose=0,
                  warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       186
           1       0.94      0.90      0.92       184
           2       0.95      0.96      0.95       186
           3       0.95      0.92      0.93       186
           4       0.99      1.00      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.25,
                   n_estimators=100, random_state=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.31      0.79      0.45       186
           1       0.57      0.40      0.47       184
           2       0.87      0.35      0.50       186
           3       0.71      0.56      0.63       186
           4       1.00      0.57      0.73       184

    accuracy                           0.53       926
   macro avg       0.69      0.53      0.55       926
weighted avg       0.69      0.53      0.55       926

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=1, subsample=1.0, tol=0.0001,
                           validation_fraction=0.2, verbose=0,
                           warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.74      0.79      0.76       186
           1       0.86      0.85      0.85       184
           2       0.94      0.95      0.94       186
           3       0.93      0.88      0.91       186
           4       1.00      0.98      0.99       184

    accuracy                           0.89       926
   macro avg       0.89      0.89      0.89       926
weighted avg       0.89      0.89      0.89       926

<catboost.core.CatBoostClassifier object at 0x7fe6e34eba50>
***************************************************************************
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       186
           1       0.94      0.90      0.92       184
           2       0.97      0.98      0.98       186
           3       0.95      0.96      0.96       186
           4       0.99      0.99      0.99       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.4,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softmax', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.85      0.90      0.88       186
           1       0.93      0.87      0.90       184
           2       0.95      0.96      0.96       186
           3       0.95      0.95      0.95       186
           4       1.00      1.00      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

Out[ ]:
Method Train Accuracy Test Accuracy Precision Recall F1-Score Multi-Class Logloss
1 LogisticRegression 0.969468 0.916847 0.916469 0.916847 0.916501 0.379206
2 KNeighborsClassifier 0.982437 0.950324 0.951646 0.950324 0.949567 0.711496
3 SVC 0.995947 0.952484 0.954556 0.952484 0.952690 0.147141
4 DecisionTreeClassifier 0.998919 0.892009 0.892950 0.892009 0.892201 3.624266
5 RandomForestClassifier 0.998379 0.947084 0.947634 0.947084 0.947252 0.342710
6 BaggingClassifier 0.997568 0.935205 0.935772 0.935205 0.935324 0.342162
7 AdaBoostClassifier 0.514726 0.533477 0.692508 0.533477 0.553171 1.313712
8 GradientBoostingClassifier 0.937855 0.890929 0.893619 0.890929 0.891923 0.551757
9 CatBoostClassifier 0.997838 0.951404 0.951445 0.951404 0.951304 0.252907
10 XGBClassifier 0.977574 0.936285 0.937181 0.936285 0.936377 0.290846
In [ ]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape
Out[ ]:
((3701, 750), (926, 750), (3701, 5), (926, 5))
In [ ]:
# #unigrams bigrams and trigrams
# X=x_ML_tfidf_3.drop(['Accident_Level'],axis=1)
# Y=x_ML_tfidf_3.Accident_Level
# X_train, X_test, y_train, y_test = train_test_split(X,  Y, test_size = 0.20, random_state = 1, stratify = y_ML)
# train_test_allmodels(X_train, X_test, y_train, y_test, 'no')

We can see that SVC gives us the best accuracy and the lowest loss as compared to the other machine learning models.

It is followed by the Catboost classifier which gives us the next best results.

Let us see what results we get using count vectorized data.

  1. Unigrams
In [ ]:
X=x_ML_cv_1.drop(['Accident_Level'],axis=1)
Y=x_ML_cv_1.Accident_Level
X_train, X_test, y_train, y_test = train_test_split(X,  Y, test_size = 0.20, random_state = 1, stratify = y_ML)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
(3701, 750)
(926, 750)
(3701,)
(926,)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.86      0.91      0.89       186
           1       0.92      0.87      0.90       184
           2       0.97      0.96      0.96       186
           3       0.94      0.97      0.95       186
           4       1.00      0.99      0.99       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
***************************************************************************
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       186
           1       0.91      0.86      0.89       184
           2       0.96      0.97      0.96       186
           3       0.94      0.92      0.93       186
           4       1.00      0.98      0.99       184

    accuracy                           0.92       926
   macro avg       0.93      0.92      0.92       926
weighted avg       0.93      0.92      0.92       926

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.82      0.94      0.88       186
           1       0.96      0.84      0.90       184
           2       0.98      0.96      0.97       186
           3       0.94      0.95      0.95       186
           4       1.00      0.99      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')
***************************************************************************
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       186
           1       0.91      0.86      0.89       184
           2       0.96      0.94      0.95       186
           3       0.90      0.93      0.91       186
           4       0.99      0.99      0.99       184

    accuracy                           0.92       926
   macro avg       0.92      0.92      0.92       926
weighted avg       0.92      0.92      0.92       926

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       186
           1       0.95      0.88      0.92       184
           2       0.98      0.97      0.98       186
           3       0.95      0.95      0.95       186
           4       1.00      1.00      1.00       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=0.75, n_estimators=30,
                  n_jobs=None, oob_score=True, random_state=1, verbose=0,
                  warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       186
           1       0.98      0.89      0.93       184
           2       0.97      0.97      0.97       186
           3       0.95      0.95      0.95       186
           4       0.98      0.99      0.99       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.25,
                   n_estimators=100, random_state=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.21      0.25      0.23       186
           1       0.41      0.88      0.56       184
           2       1.00      0.20      0.33       186
           3       0.66      0.53      0.59       186
           4       1.00      0.68      0.81       184

    accuracy                           0.51       926
   macro avg       0.66      0.51      0.50       926
weighted avg       0.66      0.51      0.50       926

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=1, subsample=1.0, tol=0.0001,
                           validation_fraction=0.2, verbose=0,
                           warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       186
           1       0.87      0.83      0.85       184
           2       0.91      0.92      0.91       186
           3       0.91      0.91      0.91       186
           4       1.00      0.98      0.99       184

    accuracy                           0.89       926
   macro avg       0.89      0.89      0.89       926
weighted avg       0.89      0.89      0.89       926

<catboost.core.CatBoostClassifier object at 0x7fe6e3a714d0>
***************************************************************************
              precision    recall  f1-score   support

           0       0.88      0.95      0.91       186
           1       0.96      0.88      0.92       184
           2       0.98      0.98      0.98       186
           3       0.95      0.95      0.95       186
           4       1.00      1.00      1.00       184

    accuracy                           0.95       926
   macro avg       0.95      0.95      0.95       926
weighted avg       0.95      0.95      0.95       926

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.4,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softmax', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.84      0.91      0.88       186
           1       0.92      0.88      0.90       184
           2       0.97      0.94      0.96       186
           3       0.96      0.95      0.95       186
           4       1.00      1.00      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

Out[ ]:
Method Train Accuracy Test Accuracy Precision Recall F1-Score Multi-Class Logloss
1 LogisticRegression 0.996487 0.939525 0.940483 0.939525 0.939635 0.165660
2 KNeighborsClassifier 0.973250 0.924406 0.925884 0.924406 0.924808 0.921493
3 SVC 0.983248 0.937365 0.941650 0.937365 0.937797 0.184086
4 DecisionTreeClassifier 0.998649 0.917927 0.918842 0.917927 0.918103 2.655478
5 RandomForestClassifier 0.995677 0.950324 0.951876 0.950324 0.950458 0.338201
6 BaggingClassifier 0.997298 0.952484 0.953847 0.952484 0.952415 0.309629
7 AdaBoostClassifier 0.512834 0.506479 0.656274 0.506479 0.503143 1.265107
8 GradientBoostingClassifier 0.921373 0.887689 0.889048 0.887689 0.888155 0.560558
9 CatBoostClassifier 0.996487 0.951404 0.952840 0.951404 0.951416 0.188820
10 XGBClassifier 0.968927 0.936285 0.938061 0.936285 0.936728 0.294877
  1. Bi grams
In [ ]:
X=x_ML_cv_2.drop(['Accident_Level'],axis=1)
Y=x_ML_cv_2.Accident_Level
X_train, X_test, y_train, y_test = train_test_split(X,  Y, test_size = 0.20, random_state = 1, stratify = y_ML)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
(3701, 750)
(926, 750)
(3701,)
(926,)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       186
           1       0.92      0.85      0.88       184
           2       0.94      0.95      0.94       186
           3       0.95      0.97      0.96       186
           4       1.00      0.99      1.00       184

    accuracy                           0.93       926
   macro avg       0.93      0.93      0.93       926
weighted avg       0.93      0.93      0.93       926

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
***************************************************************************
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       186
           1       0.94      0.87      0.90       184
           2       0.92      0.97      0.94       186
           3       0.94      0.92      0.93       186
           4       1.00      0.98      0.99       184

    accuracy                           0.93       926
   macro avg       0.93      0.93      0.93       926
weighted avg       0.93      0.93      0.93       926

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.82      0.94      0.88       186
           1       0.95      0.86      0.90       184
           2       0.98      0.95      0.96       186
           3       0.95      0.94      0.94       186
           4       1.00      0.99      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')
***************************************************************************
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       186
           1       0.90      0.88      0.89       184
           2       0.94      0.92      0.93       186
           3       0.90      0.88      0.89       186
           4       0.99      1.00      1.00       184

    accuracy                           0.91       926
   macro avg       0.91      0.91      0.91       926
weighted avg       0.91      0.91      0.91       926

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.85      0.96      0.90       186
           1       0.97      0.89      0.93       184
           2       0.98      0.96      0.97       186
           3       0.94      0.92      0.93       186
           4       1.00      1.00      1.00       184

    accuracy                           0.94       926
   macro avg       0.95      0.94      0.95       926
weighted avg       0.95      0.94      0.95       926

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=0.75, n_estimators=30,
                  n_jobs=None, oob_score=True, random_state=1, verbose=0,
                  warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       186
           1       0.95      0.88      0.91       184
           2       0.96      0.96      0.96       186
           3       0.95      0.92      0.94       186
           4       0.99      0.99      0.99       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.25,
                   n_estimators=100, random_state=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.21      0.29      0.24       186
           1       0.35      0.80      0.49       184
           2       0.72      0.28      0.40       186
           3       0.91      0.17      0.29       186
           4       1.00      0.75      0.86       184

    accuracy                           0.46       926
   macro avg       0.64      0.46      0.46       926
weighted avg       0.64      0.46      0.46       926

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=1, subsample=1.0, tol=0.0001,
                           validation_fraction=0.2, verbose=0,
                           warm_start=False)
***************************************************************************
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       186
           1       0.88      0.84      0.86       184
           2       0.94      0.91      0.93       186
           3       0.91      0.89      0.90       186
           4       1.00      0.98      0.99       184

    accuracy                           0.89       926
   macro avg       0.89      0.89      0.89       926
weighted avg       0.89      0.89      0.89       926

<catboost.core.CatBoostClassifier object at 0x7fe75573a210>
***************************************************************************
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       186
           1       0.96      0.90      0.93       184
           2       0.97      0.98      0.98       186
           3       0.94      0.97      0.95       186
           4       1.00      0.99      1.00       184

    accuracy                           0.96       926
   macro avg       0.96      0.96      0.96       926
weighted avg       0.96      0.96      0.96       926

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.4,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softmax', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
***************************************************************************
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       186
           1       0.95      0.89      0.92       184
           2       0.97      0.95      0.96       186
           3       0.96      0.93      0.95       186
           4       1.00      1.00      1.00       184

    accuracy                           0.94       926
   macro avg       0.94      0.94      0.94       926
weighted avg       0.94      0.94      0.94       926

Out[ ]:
Method Train Accuracy Test Accuracy Precision Recall F1-Score Multi-Class Logloss
1 LogisticRegression 0.994056 0.930886 0.931388 0.930886 0.930797 0.181403
2 KNeighborsClassifier 0.972170 0.925486 0.927147 0.925486 0.925727 1.258497
3 SVC 0.977844 0.935205 0.939476 0.935205 0.935933 0.190985
4 DecisionTreeClassifier 0.998919 0.907127 0.908346 0.907127 0.907493 2.994410
5 RandomForestClassifier 0.997568 0.944924 0.947762 0.944924 0.945312 0.280075
6 BaggingClassifier 0.997568 0.941685 0.943200 0.941685 0.941734 0.321494
7 AdaBoostClassifier 0.473115 0.457883 0.638986 0.457883 0.455261 1.278595
8 GradientBoostingClassifier 0.920562 0.889849 0.893489 0.889849 0.891104 0.560389
9 CatBoostClassifier 0.995947 0.955724 0.956241 0.955724 0.955673 0.186631
10 XGBClassifier 0.966766 0.937365 0.940405 0.937365 0.938056 0.302943
  1. Trigrams
In [ ]:
# X=x_ML_cv_3.drop(['Accident_Level'],axis=1)
# Y=x_ML_cv_3.Accident_Level
# X_train, X_test, y_train, y_test = train_test_split(X,  Y, test_size = 0.20, random_state = 1, stratify = y_ML)
# print(X_train.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)
# train_test_allmodels(X_train, X_test, y_train, y_test, 'no')

There is not much of a difference between the results of count vectorized data and tf idf vectorized data. In both cases, SVC performs the best followed by the catboost classifier. One more observation is that the data performs the best using tf idf vectorizer (bi-grams) with a training accuracy of 99% and a test accuracy of 96.1%. The precision, recall and F1 scores are also very good approx 96.1%, making it the best performed model.

We will now work on the deep learning data. Will pass the deep learning data to the below models:

  1. Simple Neural Network Model
  2. LSTM
  3. Bi-directional LSTM

First, we will embed our deep learning data using Glove embeddings

In [ ]:
my_corpus = []
for text in accident_safety_data_upsampled['Description_DL']:
    words = [word.lower() for word in word_tokenize(text)] 
    my_corpus.append(words)
num_words = len(my_corpus)
print(num_words)
4627
In [ ]:
accident_safety_data_upsampled.head()
Out[ ]:
Accident_Level Description Description_DL Description_ML
0 I piece get rid of the mandrillus leucophaeus re... piece get rid of the mandrillus leucophaeus re... piec get rid mandrillu leucophaeu retin rod ga...
1 I While removing the drill rod of the Jumbo 08 f... While removing the drill rod of the Jumbo for... remov drill rod jumbo mainten radio beam super...
2 I While removing the drill rod of the Jumbo 08 f... While removing the drill rod of the Jumbo for... remov drill rod jumbo mainten supervisor proce...
3 I During the energizing of a atomic number sulp... During the energizing of a atomic number sulp... energ atom number sulphid pump pipe decoupl su...
4 I During the activation of deoxyadenosine monoph... During the activation of deoxyadenosine monoph... activ deoxyadenosin monophosph sodium sulphid ...
In [ ]:
X = accident_safety_data_upsampled['Description_DL']
Y = accident_safety_data_upsampled['Accident_Level']
In [ ]:
#Labelling the column Accident_Level
Y = LabelEncoder().fit_transform(Y)
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = Y)
print('X_text_train shape : ({0})'.format(X_train.shape[0]))
print('y_text_train shape : ({0},)'.format(y_train.shape[0]))
print('X_text_test shape : ({0})'.format(X_test.shape[0]))
print('y_text_test shape : ({0},)'.format(y_test.shape[0]))
X_text_train shape : (3701)
y_text_train shape : (3701,)
X_text_test shape : (926)
y_text_test shape : (926,)

Since we will be passing this data to a deep learning model, we will have to one hot encode the Y variable.

In [ ]:
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

Step 1 : convert the words into thier corresponding numeric indexes.

In [ ]:
tokenizer = Tokenizer(num_words)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

Step 2: Since the length of the sentences returned by the tokenizer are of varying lengths, we will need to pad the sequences

In [ ]:
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)
vocab_size: 7593
In [ ]:
max_length = 750
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)
In [ ]:
#Let us now create our test and validation set as 50 50
# X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, test_size = 0.5, random_state=2)
In [ ]:
print(X_train.shape)
print(X_test.shape)
# # print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
# # print(y_val.shape)
(3701, 750)
(926, 750)
(3701, 5)
(926, 5)

Let us make a weight matrix of all words in corpus using pre-trained glove embeddings

In [ ]:
import numpy as np

embedding = {}
with open("/content/drive/MyDrive/Colab Notebooks/NLP/CapstoneProjectNLP/data/glove.6B.200d.txt") as file:
    for line in file:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding[word] = vectors
file.close()
In [ ]:
embedding_size = 200
embeddings_dictionary = dict()

embedding_matrix = np.zeros((vocab_size, embedding_size))
for i, word in tokenizer.index_word.items():
    if i < (num_words+1):
        vector = embedding.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
In [ ]:
print(len(embedding.values()))
print(embedding_matrix.shape)
400000
(7593, 200)

Simple NN model

In [ ]:
epochs=20
In [ ]:
model = Sequential()
model.add(Dense(32, input_shape=(len(X_train[0]),), activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(len(y_train[0]), activation='softmax'))

# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
#sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#fitting and saving the model 
hist = model.fit(np.array(X_train), np.array(y_train), validation_split=0.2 , epochs=epochs, batch_size=5, verbose=1)
Epoch 1/20
592/592 [==============================] - 16s 3ms/step - loss: 33.8174 - accuracy: 0.2110 - val_loss: 1.7383 - val_accuracy: 0.2348
Epoch 2/20
592/592 [==============================] - 1s 2ms/step - loss: 1.7583 - accuracy: 0.2298 - val_loss: 1.6428 - val_accuracy: 0.2227
Epoch 3/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5889 - accuracy: 0.2237 - val_loss: 1.6443 - val_accuracy: 0.2281
Epoch 4/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5795 - accuracy: 0.2261 - val_loss: 1.6269 - val_accuracy: 0.2348
Epoch 5/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5616 - accuracy: 0.2530 - val_loss: 1.6062 - val_accuracy: 0.2348
Epoch 6/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5477 - accuracy: 0.2502 - val_loss: 1.6523 - val_accuracy: 0.2497
Epoch 7/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5415 - accuracy: 0.2606 - val_loss: 1.6899 - val_accuracy: 0.2470
Epoch 8/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5336 - accuracy: 0.2532 - val_loss: 1.6076 - val_accuracy: 0.2443
Epoch 9/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5256 - accuracy: 0.2989 - val_loss: 1.5634 - val_accuracy: 0.2780
Epoch 10/20
592/592 [==============================] - 1s 2ms/step - loss: 1.5120 - accuracy: 0.2965 - val_loss: 1.5391 - val_accuracy: 0.2794
Epoch 11/20
592/592 [==============================] - 1s 2ms/step - loss: 1.4833 - accuracy: 0.3021 - val_loss: 1.5250 - val_accuracy: 0.2955
Epoch 12/20
592/592 [==============================] - 1s 2ms/step - loss: 1.4659 - accuracy: 0.3298 - val_loss: 1.5182 - val_accuracy: 0.2982
Epoch 13/20
592/592 [==============================] - 1s 2ms/step - loss: 1.4489 - accuracy: 0.3331 - val_loss: 1.5302 - val_accuracy: 0.2861
Epoch 14/20
592/592 [==============================] - 1s 2ms/step - loss: 1.4417 - accuracy: 0.3422 - val_loss: 1.4883 - val_accuracy: 0.2888
Epoch 15/20
592/592 [==============================] - 1s 2ms/step - loss: 1.4211 - accuracy: 0.3494 - val_loss: 1.5417 - val_accuracy: 0.3090
Epoch 16/20
592/592 [==============================] - 1s 2ms/step - loss: 1.4131 - accuracy: 0.3599 - val_loss: 1.4548 - val_accuracy: 0.3158
Epoch 17/20
592/592 [==============================] - 1s 2ms/step - loss: 1.3955 - accuracy: 0.3634 - val_loss: 1.5085 - val_accuracy: 0.3036
Epoch 18/20
592/592 [==============================] - 1s 2ms/step - loss: 1.3976 - accuracy: 0.3551 - val_loss: 1.5017 - val_accuracy: 0.3036
Epoch 19/20
592/592 [==============================] - 1s 2ms/step - loss: 1.4118 - accuracy: 0.3555 - val_loss: 1.4770 - val_accuracy: 0.3090
Epoch 20/20
592/592 [==============================] - 1s 2ms/step - loss: 1.3666 - accuracy: 0.3823 - val_loss: 1.4981 - val_accuracy: 0.3293
In [ ]:
# evaluate the keras model
train_accuracy = model.evaluate(X_train, y_train, batch_size=5, verbose=0)
test_accuracy = model.evaluate(X_test, y_test, batch_size=5, verbose=0)
print(train_accuracy,test_accuracy)
[1.3767038583755493, 0.3882734477519989] [1.5830851793289185, 0.3466522693634033]
In [ ]:
epochs = range(len(hist.history['loss'])) # Get number of epochs

# plot loss learning curves
plt.plot(epochs, hist.history['loss'], label = 'train')
plt.plot(epochs, hist.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
Out[ ]:
Text(0.5, 1.0, 'Training and validation loss')

LSTM

In [ ]:
#embedding_layer = Embedding(vocab_size, embedding_size, weights=[embedding_matrix], trainable=False)(deep_inputs)
epochs=2
model = Sequential()
model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix], trainable=False))
#model.add(SpatialDropout1D(0.01))
model.add(LSTM(32, dropout=0.01, recurrent_dropout=0.01))
model.add(Dense(10,activation='relu'))
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())
history = model.fit(X_train, y_train, epochs=epochs, batch_size=5,validation_split=0.2)
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, None, 200)         1518600   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                29824     
_________________________________________________________________
dense_4 (Dense)              (None, 10)                330       
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 55        
=================================================================
Total params: 1,548,809
Trainable params: 30,209
Non-trainable params: 1,518,600
_________________________________________________________________
None
Epoch 1/2
592/592 [==============================] - 707s 1s/step - loss: 1.6097 - acc: 0.2009 - val_loss: 1.6096 - val_acc: 0.2038
Epoch 2/2
592/592 [==============================] - 701s 1s/step - loss: 1.6096 - acc: 0.2093 - val_loss: 1.6097 - val_acc: 0.1849
In [ ]:
# evaluate the keras model
train_accuracy = model.evaluate(X_train, y_train, batch_size=5, verbose=0)
test_accuracy = model.evaluate(X_test, y_test, batch_size=5, verbose=0)
print(train_accuracy,test_accuracy)
[1.6094458103179932, 0.2002161592245102] [1.609437346458435, 0.20086392760276794]
In [ ]:
epochs = range(len(history.history['loss'])) # Get number of epochs

# plot loss learning curves
plt.plot(epochs, history.history['loss'], label = 'train')
plt.plot(epochs, history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
Out[ ]:
Text(0.5, 1.0, 'Training and validation loss')
In [ ]:
# #Pickle the model for future use
# model.save('lstm.h5')

Bi Directional LSTM

In [ ]:
class Metrics(tf.keras.callbacks.Callback):

    def __init__(self, validation_data=()):
        super().__init__()
        self.validation_data = validation_data

    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        xVal, yVal, target_type = self.validation_data
        if target_type == 'multi_class':
          val_predict_classes = model.predict_classes(xVal, verbose=0) # Multiclass
        else:
          val_predict_classes = (np.asarray(self.model.predict(xVal))).round() # Multilabel
        
        
        val_targ = yVal

        _val_f1 = f1_score(val_targ, val_predict_classes, average='micro')
        _val_recall = recall_score(val_targ, val_predict_classes, average='micro')
        _val_precision = precision_score(val_targ, val_predict_classes, average='micro')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        #print("— train_f1: %f — train_precision: %f — train_recall %f" % (_val_f1, _val_precision, _val_recall))
        return
In [ ]:
# Build a Bi-directional LSTM Neural Network
epochs=20
deep_inputs = Input(shape=(max_length,))
embedding_layer = Embedding(vocab_size, embedding_size, weights=[embedding_matrix], trainable=False)(deep_inputs)

LSTM_Layer_1 = Bidirectional(LSTM(128, return_sequences = True))(embedding_layer)
max_pool_layer_1 = GlobalMaxPool1D()(LSTM_Layer_1)
# drop_out_layer_1 = Dropout(0.5, input_shape = (256,))(max_pool_layer_1)
# dense_layer_1 = Dense(128, activation = 'relu')(drop_out_layer_1)
# drop_out_layer_2 = Dropout(0.5, input_shape = (128,))(dense_layer_1)
# dense_layer_2 = Dense(64, activation = 'relu')(max_pool_layer_1)
# drop_out_layer_3 = Dropout(0.01, input_shape = (64,))(dense_layer_2)
#(drop_out_layer_3)
dense_layer_3 = Dense(32, activation = 'relu')(max_pool_layer_1)
drop_out_layer_4 = Dropout(0.01, input_shape = (32,))(dense_layer_3)

dense_layer_4 = Dense(10, activation = 'relu')(drop_out_layer_4)
drop_out_layer_5 = Dropout(0.01, input_shape = (10,))(dense_layer_4)

dense_layer_5 = Dense(5, activation='softmax')(drop_out_layer_5)
#dense_layer_3 = Dense(5, activation='softmax')(drop_out_layer_3)

# LSTM_Layer_1 = LSTM(128)(embedding_layer)
# dense_layer_1 = Dense(5, activation='softmax')(LSTM_Layer_1)
# model = Model(inputs=deep_inputs, outputs=dense_layer_1)

model = Model(inputs=deep_inputs, outputs=dense_layer_5)
#model = Model(inputs=deep_inputs, outputs=dense_layer_3)

opt = SGD(learning_rate=0.001, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])

print(model.summary())
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 750)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 750, 200)          1518600   
_________________________________________________________________
bidirectional (Bidirectional (None, 750, 256)          336896    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                330       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 55        
=================================================================
Total params: 1,864,105
Trainable params: 345,505
Non-trainable params: 1,518,600
_________________________________________________________________
None
In [ ]:
# Use earlystopping
# callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, min_delta=0.001)
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=7, min_delta=1E-3)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5, min_delta=1E-4)

target_type = 'multi_label'
metrics = Metrics(validation_data=(X_train, y_train, target_type))

# fit the keras model on the dataset
training_history = model.fit(X_train, y_train, epochs=epochs, batch_size=8, verbose=1,validation_split=0.2, callbacks=[rlrp, metrics])
Epoch 1/20
370/370 [==============================] - 32s 63ms/step - loss: 1.6082 - acc: 0.2220 - val_loss: 1.6015 - val_acc: 0.2767
Epoch 2/20
  3/370 [..............................] - ETA: 20s - loss: 1.6016 - acc: 0.2917
/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.

370/370 [==============================] - 22s 60ms/step - loss: 1.5990 - acc: 0.2522 - val_loss: 1.5939 - val_acc: 0.2065
Epoch 3/20
370/370 [==============================] - 22s 60ms/step - loss: 1.5813 - acc: 0.2842 - val_loss: 1.5658 - val_acc: 0.3050
Epoch 4/20
370/370 [==============================] - 22s 59ms/step - loss: 1.5562 - acc: 0.3252 - val_loss: 1.5111 - val_acc: 0.3941
Epoch 5/20
370/370 [==============================] - 23s 61ms/step - loss: 1.5018 - acc: 0.4219 - val_loss: 1.4339 - val_acc: 0.3819
Epoch 6/20
370/370 [==============================] - 22s 59ms/step - loss: 1.3875 - acc: 0.4804 - val_loss: 1.3747 - val_acc: 0.4507
Epoch 7/20
370/370 [==============================] - 22s 59ms/step - loss: 1.2308 - acc: 0.6015 - val_loss: 1.0861 - val_acc: 0.5574
Epoch 8/20
370/370 [==============================] - 22s 60ms/step - loss: 1.0148 - acc: 0.6927 - val_loss: 0.8358 - val_acc: 0.6856
Epoch 9/20
370/370 [==============================] - 22s 59ms/step - loss: 0.7338 - acc: 0.7829 - val_loss: 0.6483 - val_acc: 0.7733
Epoch 10/20
370/370 [==============================] - 22s 60ms/step - loss: 0.5608 - acc: 0.8402 - val_loss: 0.5016 - val_acc: 0.8596
Epoch 11/20
370/370 [==============================] - 22s 59ms/step - loss: 0.3957 - acc: 0.8934 - val_loss: 0.3898 - val_acc: 0.8704
Epoch 12/20
370/370 [==============================] - 22s 58ms/step - loss: 0.2736 - acc: 0.9358 - val_loss: 0.2562 - val_acc: 0.9420
Epoch 13/20
370/370 [==============================] - 22s 61ms/step - loss: 0.1888 - acc: 0.9577 - val_loss: 0.2466 - val_acc: 0.9258
Epoch 14/20
370/370 [==============================] - 22s 60ms/step - loss: 0.1495 - acc: 0.9737 - val_loss: 0.1761 - val_acc: 0.9541
Epoch 15/20
370/370 [==============================] - 22s 59ms/step - loss: 0.1195 - acc: 0.9747 - val_loss: 0.1746 - val_acc: 0.9433
Epoch 16/20
370/370 [==============================] - 22s 59ms/step - loss: 0.0908 - acc: 0.9821 - val_loss: 0.1753 - val_acc: 0.9474
Epoch 17/20
370/370 [==============================] - 23s 61ms/step - loss: 0.0849 - acc: 0.9815 - val_loss: 0.1334 - val_acc: 0.9663
Epoch 18/20
370/370 [==============================] - 22s 59ms/step - loss: 0.0713 - acc: 0.9867 - val_loss: 0.3866 - val_acc: 0.8853
Epoch 19/20
370/370 [==============================] - 22s 60ms/step - loss: 0.0683 - acc: 0.9844 - val_loss: 0.1609 - val_acc: 0.9487
Epoch 20/20
370/370 [==============================] - 22s 59ms/step - loss: 0.0652 - acc: 0.9817 - val_loss: 0.1294 - val_acc: 0.9582
In [ ]:
# evaluate the keras model
train_accuracy = model.evaluate(X_train, y_train, batch_size=5, verbose=0)
test_accuracy = model.evaluate(X_test, y_test, batch_size=5, verbose=0)
print(train_accuracy,test_accuracy)
[0.039383333176374435, 0.9913536906242371] [0.15492010116577148, 0.946004331111908]
In [ ]:
epochs = range(len(training_history.history['loss'])) # Get number of epochs

# plot loss learning curves
plt.plot(epochs, training_history.history['loss'], label = 'train')
plt.plot(epochs, training_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
Out[ ]:
Text(0.5, 1.0, 'Training and validation loss')
In [ ]:
epochs = range(len(training_history.history['val_acc'])) # Get number of epochs

# plot loss learning curves
plt.plot(epochs, training_history.history['acc'], label = 'train')
plt.plot(epochs, training_history.history['val_acc'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
Out[ ]:
Text(0.5, 1.0, 'Training and validation accuracy')
In [ ]:
y_pred=model.predict(X_test)
In [ ]:
y_pred
Out[ ]:
array([[6.7976589e-04, 4.0395207e-07, 3.5001010e-02, 9.6431077e-01,
        8.0387690e-06],
       [6.7823561e-04, 9.3540820e-10, 1.9017217e-03, 9.9741781e-01,
        2.2232496e-06],
       [4.2721367e-06, 1.9962918e-05, 9.9842024e-01, 7.0546416e-04,
        8.5005153e-04],
       ...,
       [8.7916929e-01, 8.7064531e-05, 1.5881864e-03, 1.1463200e-01,
        4.5233965e-03],
       [1.4944122e-02, 7.1788030e-08, 2.0704451e-03, 9.8294157e-01,
        4.3683121e-05],
       [1.1618224e-06, 7.0920549e-05, 5.1054178e-04, 1.5194407e-10,
        9.9941742e-01]], dtype=float32)
In [ ]:
#Pickle the model for future use
model.save('bidirectional_lstm_model.h5')

Conclusion:

We can see from the above scores that the bi directional LSTM model has performed the best out of all machine learning and deep learning models. The accuracy is very high and the loss is also very low. Since the bidirectional LSTM performed the best we will be working with the chatbot using this model . Let us first pickle the best machine learning model(SVC) and deep learning model(Bi Direcional LSTM).

SVC with unigrams has given us the best training (97% approx) and test (96% approx) scores hence we have picked this model for machine learning.The code for the pickled file is in the file train_ml_model.py.

Let us now build the UI for a chatbot.

In [ ]:
!pip install chatterbot-corpus
Collecting chatterbot-corpus
  Downloading chatterbot_corpus-1.2.0-py2.py3-none-any.whl (117 kB)
     |████████████████████████████████| 117 kB 7.6 MB/s 
Requirement already satisfied: PyYAML<4.0,>=3.12 in /usr/local/lib/python3.7/dist-packages (from chatterbot-corpus) (3.13)
Installing collected packages: chatterbot-corpus
Successfully installed chatterbot-corpus-1.2.0
In [ ]:
!python app.py
2021-07-26 17:48:40.530036: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
/usr/local/lib/python3.7/dist-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning:

sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.

2021-07-26 17:48:42.244131: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-26 17:48:42.257039: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-07-26 17:48:42.257101: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (64f6cbdcb51c): /proc/driver/nvidia/version does not exist
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
 * Serving Flask app "app" (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: on
 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
 * Restarting with stat
2021-07-26 17:49:02.564544: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
/usr/local/lib/python3.7/dist-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning:

sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.

 * Running on http://d0b57d8a40cc.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
2021-07-26 17:49:04.697223: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-26 17:49:04.710380: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-07-26 17:49:04.710445: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (64f6cbdcb51c): /proc/driver/nvidia/version does not exist
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
List Trainer: [####################] 100%
 * Debugger is active!
 * Debugger PIN: 311-672-497
127.0.0.1 - - [26/Jul/2021 17:49:21] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Jul/2021 17:49:21] "GET / HTTP/1.1" 200 -
 * Running on http://d0b57d8a40cc.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
127.0.0.1 - - [26/Jul/2021 17:52:26] "POST /upload HTTP/1.1" 200 -
upsampling complete!
upsampling complete!
upsampling complete!
upsampling complete!
upsampling complete!
127.0.0.1 - - [26/Jul/2021 17:53:48] "POST /augment HTTP/1.1" 200 -
127.0.0.1 - - [26/Jul/2021 17:55:34] "POST /clean_dl_data HTTP/1.1" 200 -
hse_data.csv
127.0.0.1 - - [26/Jul/2021 17:55:59] "POST /clean_ml_data HTTP/1.1" 200 -
hse_data.csv
750
127.0.0.1 - - [26/Jul/2021 18:00:19] "POST /load_ml_models HTTP/1.1" 500 -
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 2464, in __call__
    return self.wsgi_app(environ, start_response)
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 2450, in wsgi_app
    response = self.handle_exception(e)
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 1867, in handle_exception
    reraise(exc_type, exc_value, tb)
  File "/usr/local/lib/python3.7/dist-packages/flask/_compat.py", line 39, in reraise
    raise value
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/usr/local/lib/python3.7/dist-packages/flask/_compat.py", line 39, in reraise
    raise value
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 1950, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 1936, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "/content/app.py", line 149, in load_ml_models
    df = ml_models(upsampled_result_df,num_features)
  File "/content/automation.py", line 119, in ml_models
    results_df = train_test_allmodels(X_train, X_test, y_train, y_test, 'no','no','no')
  File "/content/train_ml_model.py", line 113, in train_test_allmodels
    reg_resultsDf = train_test_model(classifier, name, X_train_common, X_test_common, y_train, y_test, 'none', i, scale, report, save_model)
  File "/content/train_ml_model.py", line 35, in train_test_model
    model.fit(X_train, y_train) # Fit the model on Training set
  File "/usr/local/lib/python3.7/dist-packages/catboost/core.py", line 4675, in fit
    silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
  File "/usr/local/lib/python3.7/dist-packages/catboost/core.py", line 1999, in _fit
    train_params["init_model"]
  File "/usr/local/lib/python3.7/dist-packages/catboost/core.py", line 1425, in _train
    self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)
  File "_catboost.pyx", line 4346, in _catboost._CatBoost._train
    
  File "_catboost.pyx", line 4395, in _catboost._CatBoost._train
    
_catboost.CatBoostError: catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 100: no CUDA-capable device is detected
In [ ]: